Library load ¶

In [2]:
import scanpy as sc, anndata as ad, numpy as np, pandas as pd
from scipy import sparse
from anndata import AnnData
import warnings
import socket

import seaborn as sns
import itertools
import holoviews as hv
import plotly.express as px
import kaleido
import scanpy.external as sce

from matplotlib import pylab
import sys
import yaml
import os
from pandas.api.types import CategoricalDtype
import plotly
import random


import matplotlib.pyplot as plt
In [3]:
# Enable offline (in-notebook) rendering of plotly figures.
plotly.offline.init_notebook_mode()
In [4]:
# Equivalent of `%matplotlib inline`: render matplotlib figures inside the notebook.
get_ipython().run_line_magic('matplotlib', 'inline')
In [5]:
sc.settings.verbosity = 3         # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()         # record library versions for reproducibility
sc.settings.set_figure_params(dpi=50, facecolor='white')  # global scanpy figure style
pylab.rcParams['figure.figsize'] = (4, 4)  # default matplotlib figure size
scanpy==1.8.1 anndata==0.7.6 umap==0.4.6 numpy==1.20.2 scipy==1.6.3 pandas==1.2.4 scikit-learn==0.24.2 statsmodels==0.13.1 python-igraph==0.9.8 louvain==0.7.1 pynndescent==0.5.5
In [6]:
# Input/output locations used throughout the notebook.
outdir = "./outdir"
adataPath = f"{outdir}/3_polaroid_quickAnno.h5ad"   # quick-annotated polaroid object
adataRawPath = f"{outdir}/1_polaroid_mint.h5ad"     # raw-count ("mint") polaroid object
BadhuriPath = f"{outdir}/NowaCurated.h5ad"          # curated reference atlas
BadhuriRawPath = "./data/Badhuri_atlas.h5ad"        # raw reference atlas

figDir = "./figures"

Mapping annotations ¶

In [7]:
# Human-readable names for the Leiden (resolution 0.3) clusters of the
# polaroid dataset; insertion order doubles as the category order used later.
_leiden3_names = [
    "POL_Encs-1", "POL_CBC/BRCs-1", "POL_EXN-1", "POL_EnCs-2",
    "POL_RGCs-1", "POL_RGCs-2", "POL_ccRGCs", "POL_inPCs",
    "POL_EXN-2", "POL_RtCs", "POL_ER-Cs", "POL_CBC/BRCs-2",
]
leiden3Mapping = {str(i): name for i, name in enumerate(_leiden3_names)}


# Prefix the reference-atlas cluster labels so the two datasets remain
# distinguishable after concatenation.
badhuriClMapping = {
    label: "BADH_" + label
    for label in ["ipc", "interneuron", "dividing", "opc",
                  "radial", "cr", "excitatory", "endothelial"]
}

# Display orders for the two datasets' categories.
BadhuriCategoriesOrder = ["BADH_dividing", "BADH_radial", "BADH_opc", "BADH_ipc",
                          "BADH_cr", "BADH_excitatory", "BADH_interneuron", "BADH_endothelial"]
PolaroidsCategoriesorder = ["POL_CRGC", "POL_RGC1", "POL_RGC2", "POL_RCs",
                           "POL_EXN1", "POL_EXN2", "POL_InN", "POL_ER",
                           "POL_CPC1", "POL_CPC2", "POL_ChP/CBC/BRC-1", "POL_ChP/CBC/BRC-2"]

Adata load ¶

In [8]:
# Load the quick-annotated polaroid object and attach readable cluster names.
adata = sc.read_h5ad(adataPath)

# Map numeric Leiden labels to annotation names, then fix the category order
# to match the mapping's insertion order.
annotated = adata.obs["leiden0.3"].replace(leiden3Mapping)
annotated = annotated.astype("category")
adata.obs["leidenAnno"] = annotated.cat.reorder_categories(list(leiden3Mapping.values()))

# Reuse the original cluster palette so colors match between both labelings.
adata.uns["leidenAnno_colors"] = adata.uns["leiden0.3_colors"].copy()
In [9]:
# Side-by-side UMAPs: numeric Leiden clusters vs. the annotated names.
sc.pl.umap(adata, color=["leiden0.3","leidenAnno"])
In [10]:
# Replace the processed expression matrix with raw counts so downstream
# normalization starts from unmodified values.
# (Removed a leftover no-op debugging expression `adata.raw.to_adata().X.data`.)
adataRaw = sc.read_h5ad(adataRawPath)

# Align the raw object to the annotated one (same cells, same genes) before
# taking its matrix.
adata.X = adataRaw[adata.obs_names, adata.var_names].X
del adataRaw  # free memory; only the matrix was needed

Badhuri load ¶

In [11]:
# Load the raw reference atlas and recover gene symbols: the processed
# object's .var carries the feature metadata (incl. 'feature_name') that we
# want as the gene index, while .raw holds the full count matrix.
adataBadhuri_raw = sc.read_h5ad(BadhuriRawPath)
varMap = adataBadhuri_raw.var.copy()
adataBadhuri_raw = adataBadhuri_raw.raw.to_adata()
# Merge the saved var table back onto the raw var (left join on the gene
# index; 'feature_biotype' dropped to avoid a duplicated column) and
# re-index by the human-readable feature name.
adataBadhuri_raw.var = adataBadhuri_raw.var.merge(varMap.drop(columns=["feature_biotype"]), left_index=True, right_index=True, how="left").set_index("feature_name")
adataBadhuri_raw.var.index.name = None


# Curated version of the atlas; use its .raw to get the full gene set.
adataBadhuri = sc.read_h5ad(BadhuriPath)
adataBadhuri = adataBadhuri.raw.to_adata()


# Swap in the raw counts (aligned by cell and gene) and prefix the
# aggregated cluster labels with the dataset tag.
adataBadhuri.X = adataBadhuri_raw[adataBadhuri.obs_names, adataBadhuri.var_names].X
adataBadhuri.obs["cluster_label_aggr"] = adataBadhuri.obs["cluster_label_aggr"].replace(badhuriClMapping)
del adataBadhuri_raw
/usr/local/lib/python3.8/dist-packages/anndata/_core/anndata.py:794: UserWarning:


AnnData expects .var.index to contain strings, but got values like:
    ['RP11-34P13.7', 'AP006222.1', 'RP4-669L17.4', 'LINC01409', 'LINC00115']

    Inferred to be: categorical


Subset for common genes and concatenation ¶

In [12]:
# Subset for common genes
Commongenes = adataBadhuri.var_names.intersection(adata.var_names)

adataBadhuri_obs_backup = adataBadhuri.obs.copy()
adata_obs_backup = adata.obs.copy()

# Store relevant metadata
adataBadhuri.obs["ledenAnnotated"] = adataBadhuri.obs["cluster_label_aggr"].copy()
adata.obs["ledenAnnotated"] = adata.obs["leiden0.3"].replace(leiden3Mapping).copy()

adataBadhuri.obs["ledenAnnotated_area"] = adataBadhuri.obs["ledenAnnotated"].astype(str) +"_"+ adataBadhuri.obs["cortical_area"].astype(str)
adata.obs["ledenAnnotated_area"] = adata.obs["ledenAnnotated"].astype(str) +"_"+ adata.obs["region"].astype(str)

adataBadhuri.obs = adataBadhuri.obs[["ledenAnnotated","ledenAnnotated_area"]]
adata.obs = adata.obs[["ledenAnnotated","ledenAnnotated_area"]]

adataBadhuri.obs["dataset"] = "Badhuri"
adata.obs["dataset"] = "Polaroids"

#Keep only relevant cols
adataBadhuri.obs = adataBadhuri.obs[["ledenAnnotated","ledenAnnotated_area","dataset"]]
adata.obs = adata.obs[["ledenAnnotated","ledenAnnotated_area","dataset"]]


CombinedAdata = ad.concat([adataBadhuri[:,Commongenes],adata[:,Commongenes]])
/usr/local/lib/python3.8/dist-packages/anndata/_core/merge.py:895: UserWarning:

Only some AnnData objects have `.raw` attribute, not concatenating `.raw` attributes.

Set metadata for aggregation¶

In [13]:
# Collapse per-area labels into one aggregated label by stripping the
# anatomical-origin suffixes that were appended to the annotation names.
area_suffixes = ["_proximal", "_medial", "_distal", "_piece1", "_piece2", "_piece3"]
strip_map = {suffix: "" for suffix in area_suffixes}
CombinedAdata.obs["ledenAnnotated_area_aggregated"] = (
    CombinedAdata.obs["ledenAnnotated_area"]
    .astype("str")
    .replace(strip_map, regex=True)
    .astype("category")
)
In [14]:
# Preserve the raw counts, then store each normalization stage as a layer
# so later steps can pick whichever representation they need.
CombinedAdata.layers["mint"] = CombinedAdata.X.copy()

# Library-size normalization to 20k counts per cell (in place on .X).
sc.pp.normalize_total(CombinedAdata, target_sum = 20000)
CombinedAdata.layers["libNorm"] = CombinedAdata.X.copy()

# log(1 + x) on the library-normalized counts (in place on .X).
sc.pp.log1p(CombinedAdata)
CombinedAdata.layers["libNorm_logNorm"] = CombinedAdata.X.copy()

# Backup used as the starting point for pseudobulking below.
CombinedAdata_BU = CombinedAdata.copy()


#CombinedAdata_BU.write_h5ad(outdir+"/Correlaton_preBulk.h5ad")
normalizing counts per cell
    finished (0:00:02)

Pseudobulking ¶

In [15]:
# Target number of pseudo-replicates to build per annotation group.
PseudooReplicates_per_group = 20
#CombinedAdata_BU = sc.read_h5ad(outdir+"/Correlaton_preBulk.h5ad")


# Work on a fresh copy so the backup object stays untouched.
CombinedAdata = CombinedAdata_BU.copy()

# obs column that defines the pseudobulking groups.
groupingCovariate = "ledenAnnotated"

# Sanity check: average number of cells that will land in each
# pseudo-replicate (very small values yield noisy pseudobulk profiles).
print("Pbulking with target of "+str(PseudooReplicates_per_group)+" PRs will result in following cells per PR")
CombinedAdata.obs.groupby(groupingCovariate).size() / PseudooReplicates_per_group
Pbulking with target of 20 PRs will result in following cells per PR
Out[15]:
ledenAnnotated
BADH_cr               20.55
BADH_dividing       1977.25
BADH_endothelial      14.50
BADH_excitatory     8105.30
BADH_interneuron    3592.70
BADH_ipc            2654.45
BADH_opc             414.90
BADH_radial         2403.95
POL_CBC/BRCs-1       130.70
POL_CBC/BRCs-2        10.70
POL_ER-Cs             20.55
POL_EXN-1            119.40
POL_EXN-2             63.10
POL_EnCs-2           109.70
POL_Encs-1           172.15
POL_RGCs-1            81.90
POL_RGCs-2            77.20
POL_RtCs              24.35
POL_ccRGCs            67.35
POL_inPCs             64.00
dtype: float64
In [16]:
# Build pseudobulk profiles: within each group, cells are shuffled with a
# fixed seed and dealt round-robin into `PseudooReplicates_per_group` bins;
# each bin's library-normalized counts are summed into one pseudo-replicate.
total = pd.DataFrame(index = CombinedAdata.var_names)
total_metadata = pd.DataFrame(index = ["_".join(Rep) for Rep in itertools.product(
    CombinedAdata.obs[groupingCovariate].unique().tolist(),
    [str(r) for r in range(PseudooReplicates_per_group)])])

for group in CombinedAdata.obs[groupingCovariate].unique().tolist():
    groupAdata = CombinedAdata[CombinedAdata.obs[groupingCovariate] == group]

    # Deterministic shuffle so the pseudo-replicates are reproducible.
    group_cells = groupAdata.obs_names.tolist()
    random.Random(4).shuffle(group_cells)

    # Round-robin partition into bins of (almost) equal size.
    metaCellslist = [group_cells[i::PseudooReplicates_per_group] for i in range(PseudooReplicates_per_group)]

    for repIdx, repCells in enumerate(metaCellslist):
        repName = '{}_{}'.format(group, repIdx)

        # Sum library-normalized counts over the bin's cells (dense 1-D vector).
        total[repName] = CombinedAdata[repCells].layers["libNorm"].sum(axis = 0).A1

        total_metadata.loc[repName, "group"] = group
        total_metadata.loc[repName, "pseudoreplicate"] = repIdx
        total_metadata.loc[repName, "number_of_cell"] = int(len(repCells))

    # Propagate any obs column that is constant within the group to all of
    # its pseudo-replicates (e.g. the dataset of origin).
    for obsMD in [c for c in groupAdata.obs.columns.tolist()
                  if len(groupAdata.obs[c].unique()) == 1 and c != groupingCovariate]:
        repNames = ["_".join(l) for l in itertools.product([group], [str(r) for r in range(PseudooReplicates_per_group)])]
        # .iloc[0] replaces the deprecated positional `[0]` on a label-indexed Series.
        total_metadata.loc[repNames, obsMD] = CombinedAdata.obs.loc[CombinedAdata.obs[groupingCovariate] == group, obsMD].iloc[0]

# Drop columns that could not be propagated to every pseudo-replicate.
total_metadata = total_metadata.dropna(axis = 1)

# Assemble the pseudobulk AnnData (pseudo-replicates as obs, genes as var).
adata_bulk = sc.AnnData(total.transpose())
adata_bulk.var = CombinedAdata.var.copy()
adata_bulk.obs = pd.concat([adata_bulk.obs, total_metadata], axis = 1)

# Re-normalize the pseudobulk profiles and log-transform.
sc.pp.normalize_total(adata_bulk, target_sum = 20000)
sc.pp.log1p(adata_bulk)
normalizing counts per cell
    finished (0:00:00)

Integration: the only HVGs considered are those common to both datasets ¶

In [17]:
from importlib import reload
import harmony_mod
from harmony_mod import *

# Reload to pick up any local edits made to the module during the session.
# NOTE(review): `import *` after reload is kept to preserve the original
# workflow, though explicit imports would be cleaner.
reload(harmony_mod)
from harmony_mod import *

npcs = 10

adata_bulk_DS = adata_bulk.copy()
#adata_bulk_DS = adata_bulk_DS[adata_bulk_DS.obs.dataset.isin(["Polaroids"])]

# HVGs computed per dataset batch; keep only genes highly variable in BOTH.
HVGs = sc.pp.highly_variable_genes(adata_bulk_DS, n_top_genes=2000, batch_key="dataset", inplace=False)
HVGs = HVGs[HVGs.highly_variable_intersection].index.tolist()

adata_bulk_DS.var["highly_variable"] = False
adata_bulk_DS.var.loc[HVGs, "highly_variable"] = True

# Materialize the subset (.copy()) instead of keeping an AnnData view:
# in-place operations on a view trigger implicit-copy / SettingWithCopy
# warnings downstream (seen in the original run's output).
adata_bulk_DS = adata_bulk_DS[:, adata_bulk_DS.var["highly_variable"]].copy()


sc.tl.pca(adata_bulk_DS, use_highly_variable=True)
sc.pl.pca_variance_ratio(adata_bulk_DS)

sc.pl.pca(adata_bulk_DS, color=["group", "dataset"], ncols=1, add_outline=True, size=20, title="Pre harmony PCA")


#adata_bulk_DS.obsm["X_pca"] = adata_bulk_DS.obsm["X_pca"][:,:npcs]


# Batch-correct the PCA embedding across datasets
# (theta controls harmony's diversity-clustering penalty).
harmony_integrate_mod(adata_bulk_DS, 'dataset', adjusted_basis="X_pca", theta=4, max_iter_harmony=20)
#sc.pp.neighbors(adata_bulk_DS, n_neighbors=5, n_pcs=10)
sc.pl.pca(adata_bulk_DS, color=["group", "dataset"], ncols=1, add_outline=True, size=20, title="Post harmony PCA")
If you pass `n_top_genes`, all cutoffs are ignored.
extracting highly variable genes
... storing 'group' as categorical
... storing 'dataset' as categorical
    finished (0:00:00)
computing PCA
    on highly variable genes
    with n_comps=50
    finished (0:00:00)
/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py:1637: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

WARNING: The title list is shorter than the number of panels. Using 'color' value instead for some plots.
/usr/local/lib/python3.8/dist-packages/scipy/cluster/vq.py:575: UserWarning:

One of the clusters is empty. Re-run kmeans with a different initialization.

2023-03-13 11:25:11,512 - harmonypy - INFO - Iteration 1 of 20
2023-03-13 11:25:11,512 - harmonypy - INFO - Iteration 1 of 20
2023-03-13 11:25:11,566 - harmonypy - INFO - Iteration 2 of 20
2023-03-13 11:25:11,566 - harmonypy - INFO - Iteration 2 of 20
2023-03-13 11:25:11,619 - harmonypy - INFO - Iteration 3 of 20
2023-03-13 11:25:11,619 - harmonypy - INFO - Iteration 3 of 20
2023-03-13 11:25:11,672 - harmonypy - INFO - Iteration 4 of 20
2023-03-13 11:25:11,672 - harmonypy - INFO - Iteration 4 of 20
2023-03-13 11:25:11,730 - harmonypy - INFO - Iteration 5 of 20
2023-03-13 11:25:11,730 - harmonypy - INFO - Iteration 5 of 20
2023-03-13 11:25:11,767 - harmonypy - INFO - Iteration 6 of 20
2023-03-13 11:25:11,767 - harmonypy - INFO - Iteration 6 of 20
2023-03-13 11:25:11,798 - harmonypy - INFO - Iteration 7 of 20
2023-03-13 11:25:11,798 - harmonypy - INFO - Iteration 7 of 20
2023-03-13 11:25:11,815 - harmonypy - INFO - Iteration 8 of 20
2023-03-13 11:25:11,815 - harmonypy - INFO - Iteration 8 of 20
2023-03-13 11:25:11,846 - harmonypy - INFO - Iteration 9 of 20
2023-03-13 11:25:11,846 - harmonypy - INFO - Iteration 9 of 20
2023-03-13 11:25:11,886 - harmonypy - INFO - Iteration 10 of 20
2023-03-13 11:25:11,886 - harmonypy - INFO - Iteration 10 of 20
2023-03-13 11:25:11,947 - harmonypy - INFO - Iteration 11 of 20
2023-03-13 11:25:11,947 - harmonypy - INFO - Iteration 11 of 20
2023-03-13 11:25:11,975 - harmonypy - INFO - Iteration 12 of 20
2023-03-13 11:25:11,975 - harmonypy - INFO - Iteration 12 of 20
2023-03-13 11:25:12,021 - harmonypy - INFO - Iteration 13 of 20
2023-03-13 11:25:12,021 - harmonypy - INFO - Iteration 13 of 20
2023-03-13 11:25:12,041 - harmonypy - INFO - Iteration 14 of 20
2023-03-13 11:25:12,041 - harmonypy - INFO - Iteration 14 of 20
2023-03-13 11:25:12,076 - harmonypy - INFO - Converged after 14 iterations
2023-03-13 11:25:12,076 - harmonypy - INFO - Converged after 14 iterations
WARNING: The title list is shorter than the number of panels. Using 'color' value instead for some plots.

Correlation 1.1 between cell types ¶

In [20]:
contrastCov = "group"

# Hierarchical clustering and full correlation matrix across pseudobulk groups.
sc.tl.dendrogram(adata_bulk_DS, contrastCov, n_pcs=npcs)
sc.pl.correlation_matrix(adata_bulk_DS, contrastCov, figsize=(20, 20), show_correlation_numbers=True, cmap="PRGn")

#sc.tl.dendrogram(adata_bulk_DS, 'group')

# Category order used by scanpy for the stored correlation matrix.
GroupCat = adata_bulk_DS.obs[contrastCov].cat.categories

BadhuriGroups = adata_bulk_DS.obs.loc[adata_bulk_DS.obs["dataset"] == "Badhuri", contrastCov].unique().tolist()
PolaroidsGroups = adata_bulk_DS.obs.loc[adata_bulk_DS.obs["dataset"] == "Polaroids", contrastCov].unique().tolist()

# Per-dataset group lists, ordered by the dendrogram leaf order.
BadhuriGroupsOrdered = [i for i in adata_bulk_DS.uns["dendrogram_" + contrastCov]["categories_ordered"] if i in BadhuriGroups]
PolaroidsGroupsOrdered = [i for i in adata_bulk_DS.uns["dendrogram_" + contrastCov]["categories_ordered"] if i in PolaroidsGroups]

corrDF = pd.DataFrame(adata_bulk_DS.uns["dendrogram_" + contrastCov]["correlation_matrix"],
                      columns=GroupCat,
                      index=GroupCat)

# Rectangular slice: polaroid groups (rows) vs. reference groups (columns).
# (Renamed from the original typo `corrDFTectangular`.)
corrDFRectangular = corrDF.loc[PolaroidsGroupsOrdered, BadhuriGroupsOrdered]

fig, ax = plt.subplots(figsize=(4, 4))
ax.pcolormesh(corrDFRectangular, cmap="PRGn")
ax.set_xlim(0, corrDFRectangular.shape[1])
ax.set_ylim(0, corrDFRectangular.shape[0])

# Row labels on the right, centered on each heatmap cell.
ax.yaxis.tick_right()
ax.set_yticks(np.arange(corrDFRectangular.shape[0]) + 0.5)
ax.set_yticklabels(PolaroidsGroupsOrdered, size=5)

# Column labels on top, rotated for readability.
ax.xaxis.tick_top()
ax.xaxis.set_tick_params(labeltop=True)
ax.xaxis.set_tick_params(labelbottom=False)
ax.set_xticks(np.arange(corrDFRectangular.shape[1]) + 0.5)
ax.set_xticklabels(BadhuriGroupsOrdered, rotation=45, ha='left', size=5)

# Write each correlation value inside its cell.
for row, col in itertools.product(corrDFRectangular.index.tolist(),
                                  corrDFRectangular.columns.tolist()):
    ax.text(
        corrDFRectangular.columns.get_loc(col) + 0.5,
        corrDFRectangular.index.get_loc(row) + 0.5,
        f"{corrDFRectangular.loc[row, col]:.2f}",
        ha='center',
        va='center',
        size=4
    )

# Ensure output directories exist before writing (the nested "Correlations"
# folder is not created anywhere else in the notebook).
os.makedirs(figDir, exist_ok=True)
os.makedirs(outdir + "/Correlations", exist_ok=True)

fig.savefig(figDir + "/correlation.pdf", bbox_inches='tight')
corrDFRectangular.to_csv(outdir + "/Correlations/Corr_PCA.tsv", sep="\t")
    using 'X_pca' with n_pcs = 10
Storing dendrogram info using `.uns['dendrogram_group']`
In [ ]: